
  1. """Implements a Jinja / Python combination lexer. The ``Lexer`` class
  2. is used to do some preprocessing. It filters out invalid operators like
  3. the bitshift operators we don't allow in templates. It separates
  4. template code and python code in expressions.
  5. """
  6. import re
  7. import typing as t
  8. from ast import literal_eval
  9. from collections import deque
  10. from sys import intern
  11. from ._identifier import pattern as name_re
  12. from .exceptions import TemplateSyntaxError
  13. from .utils import LRUCache
  14. if t.TYPE_CHECKING:
  15. import typing_extensions as te
  16. from .environment import Environment
  17. # cache for the lexers. Exists in order to be able to have multiple
  18. # environments with the same lexer
  19. _lexer_cache: t.MutableMapping[t.Tuple, "Lexer"] = LRUCache(50) # type: ignore
  20. # static regular expressions
  21. whitespace_re = re.compile(r"\s+")
  22. newline_re = re.compile(r"(\r\n|\r|\n)")
  23. string_re = re.compile(
  24. r"('([^'\\]*(?:\\.[^'\\]*)*)'" r'|"([^"\\]*(?:\\.[^"\\]*)*)")', re.S
  25. )
  26. integer_re = re.compile(
  27. r"""
  28. (
  29. 0b(_?[0-1])+ # binary
  30. |
  31. 0o(_?[0-7])+ # octal
  32. |
  33. 0x(_?[\da-f])+ # hex
  34. |
  35. [1-9](_?\d)* # decimal
  36. |
  37. 0(_?0)* # decimal zero
  38. )
  39. """,
  40. re.IGNORECASE | re.VERBOSE,
  41. )
  42. float_re = re.compile(
  43. r"""
  44. (?<!\.) # doesn't start with a .
  45. (\d+_)*\d+ # digits, possibly _ separated
  46. (
  47. (\.(\d+_)*\d+)? # optional fractional part
  48. e[+\-]?(\d+_)*\d+ # exponent part
  49. |
  50. \.(\d+_)*\d+ # required fractional part
  51. )
  52. """,
  53. re.IGNORECASE | re.VERBOSE,
  54. )
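
# Illustrative sketch (not part of the original module): the numeric patterns
# above accept Python-style underscore separators, e.g.
#
#     integer_re.fullmatch("1_000")    # decimal with separator
#     integer_re.fullmatch("0x_1f")    # hex
#     float_re.fullmatch("1_000.5")    # fractional part
#     float_re.fullmatch("1e10")       # exponent only
#
# all match, while a bare leading dot such as ".5" is not accepted because
# digits are required before the decimal point.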
# intern the tokens and keep references to them
TOKEN_ADD = intern("add")
TOKEN_ASSIGN = intern("assign")
TOKEN_COLON = intern("colon")
TOKEN_COMMA = intern("comma")
TOKEN_DIV = intern("div")
TOKEN_DOT = intern("dot")
TOKEN_EQ = intern("eq")
TOKEN_FLOORDIV = intern("floordiv")
TOKEN_GT = intern("gt")
TOKEN_GTEQ = intern("gteq")
TOKEN_LBRACE = intern("lbrace")
TOKEN_LBRACKET = intern("lbracket")
TOKEN_LPAREN = intern("lparen")
TOKEN_LT = intern("lt")
TOKEN_LTEQ = intern("lteq")
TOKEN_MOD = intern("mod")
TOKEN_MUL = intern("mul")
TOKEN_NE = intern("ne")
TOKEN_PIPE = intern("pipe")
TOKEN_POW = intern("pow")
TOKEN_RBRACE = intern("rbrace")
TOKEN_RBRACKET = intern("rbracket")
TOKEN_RPAREN = intern("rparen")
TOKEN_SEMICOLON = intern("semicolon")
TOKEN_SUB = intern("sub")
TOKEN_TILDE = intern("tilde")
TOKEN_WHITESPACE = intern("whitespace")
TOKEN_FLOAT = intern("float")
TOKEN_INTEGER = intern("integer")
TOKEN_NAME = intern("name")
TOKEN_STRING = intern("string")
TOKEN_OPERATOR = intern("operator")
TOKEN_BLOCK_BEGIN = intern("block_begin")
TOKEN_BLOCK_END = intern("block_end")
TOKEN_VARIABLE_BEGIN = intern("variable_begin")
TOKEN_VARIABLE_END = intern("variable_end")
TOKEN_RAW_BEGIN = intern("raw_begin")
TOKEN_RAW_END = intern("raw_end")
TOKEN_COMMENT_BEGIN = intern("comment_begin")
TOKEN_COMMENT_END = intern("comment_end")
TOKEN_COMMENT = intern("comment")
TOKEN_LINESTATEMENT_BEGIN = intern("linestatement_begin")
TOKEN_LINESTATEMENT_END = intern("linestatement_end")
TOKEN_LINECOMMENT_BEGIN = intern("linecomment_begin")
TOKEN_LINECOMMENT_END = intern("linecomment_end")
TOKEN_LINECOMMENT = intern("linecomment")
TOKEN_DATA = intern("data")
TOKEN_INITIAL = intern("initial")
TOKEN_EOF = intern("eof")

# bind operators to token types
operators = {
    "+": TOKEN_ADD,
    "-": TOKEN_SUB,
    "/": TOKEN_DIV,
    "//": TOKEN_FLOORDIV,
    "*": TOKEN_MUL,
    "%": TOKEN_MOD,
    "**": TOKEN_POW,
    "~": TOKEN_TILDE,
    "[": TOKEN_LBRACKET,
    "]": TOKEN_RBRACKET,
    "(": TOKEN_LPAREN,
    ")": TOKEN_RPAREN,
    "{": TOKEN_LBRACE,
    "}": TOKEN_RBRACE,
    "==": TOKEN_EQ,
    "!=": TOKEN_NE,
    ">": TOKEN_GT,
    ">=": TOKEN_GTEQ,
    "<": TOKEN_LT,
    "<=": TOKEN_LTEQ,
    "=": TOKEN_ASSIGN,
    ".": TOKEN_DOT,
    ":": TOKEN_COLON,
    "|": TOKEN_PIPE,
    ",": TOKEN_COMMA,
    ";": TOKEN_SEMICOLON,
}

reverse_operators = {v: k for k, v in operators.items()}
assert len(operators) == len(reverse_operators), "operators dropped"
operator_re = re.compile(
    f"({'|'.join(re.escape(x) for x in sorted(operators, key=lambda x: -len(x)))})"
)
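
# Illustrative sketch (not part of the original module): sorting the operator
# alternatives by descending length keeps two-character operators from being
# split into two one-character tokens, e.g.
#
#     operator_re.match("**").group()  # "**", not "*"
#     operator_re.match("<=").group()  # "<=", not "<"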
ignored_tokens = frozenset(
    [
        TOKEN_COMMENT_BEGIN,
        TOKEN_COMMENT,
        TOKEN_COMMENT_END,
        TOKEN_WHITESPACE,
        TOKEN_LINECOMMENT_BEGIN,
        TOKEN_LINECOMMENT_END,
        TOKEN_LINECOMMENT,
    ]
)
ignore_if_empty = frozenset(
    [TOKEN_WHITESPACE, TOKEN_DATA, TOKEN_COMMENT, TOKEN_LINECOMMENT]
)


def _describe_token_type(token_type: str) -> str:
    if token_type in reverse_operators:
        return reverse_operators[token_type]

    return {
        TOKEN_COMMENT_BEGIN: "begin of comment",
        TOKEN_COMMENT_END: "end of comment",
        TOKEN_COMMENT: "comment",
        TOKEN_LINECOMMENT: "comment",
        TOKEN_BLOCK_BEGIN: "begin of statement block",
        TOKEN_BLOCK_END: "end of statement block",
        TOKEN_VARIABLE_BEGIN: "begin of print statement",
        TOKEN_VARIABLE_END: "end of print statement",
        TOKEN_LINESTATEMENT_BEGIN: "begin of line statement",
        TOKEN_LINESTATEMENT_END: "end of line statement",
        TOKEN_DATA: "template data / text",
        TOKEN_EOF: "end of template",
    }.get(token_type, token_type)


def describe_token(token: "Token") -> str:
    """Returns a description of the token."""
    if token.type == TOKEN_NAME:
        return token.value

    return _describe_token_type(token.type)


def describe_token_expr(expr: str) -> str:
    """Like `describe_token` but for token expressions."""
    if ":" in expr:
        type, value = expr.split(":", 1)

        if type == TOKEN_NAME:
            return value
    else:
        type = expr

    return _describe_token_type(type)
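
# Illustrative sketch (not part of the original module): a token expression is
# either a bare type or a "type:value" pair, so
#
#     describe_token_expr("name:endfor")  # "endfor"
#     describe_token_expr("block_end")    # "end of statement block"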
def count_newlines(value: str) -> int:
    """Count the number of newline characters in the string. This is
    useful for extensions that filter a stream.
    """
    return len(newline_re.findall(value))


def compile_rules(environment: "Environment") -> t.List[t.Tuple[str, str]]:
    """Compiles all the rules from the environment into a list of rules."""
    e = re.escape
    rules = [
        (
            len(environment.comment_start_string),
            TOKEN_COMMENT_BEGIN,
            e(environment.comment_start_string),
        ),
        (
            len(environment.block_start_string),
            TOKEN_BLOCK_BEGIN,
            e(environment.block_start_string),
        ),
        (
            len(environment.variable_start_string),
            TOKEN_VARIABLE_BEGIN,
            e(environment.variable_start_string),
        ),
    ]

    if environment.line_statement_prefix is not None:
        rules.append(
            (
                len(environment.line_statement_prefix),
                TOKEN_LINESTATEMENT_BEGIN,
                r"^[ \t\v]*" + e(environment.line_statement_prefix),
            )
        )
    if environment.line_comment_prefix is not None:
        rules.append(
            (
                len(environment.line_comment_prefix),
                TOKEN_LINECOMMENT_BEGIN,
                r"(?:^|(?<=\S))[^\S\r\n]*" + e(environment.line_comment_prefix),
            )
        )

    return [x[1:] for x in sorted(rules, reverse=True)]


class Failure:
    """Class that raises a `TemplateSyntaxError` if called.
    Used by the `Lexer` to specify known errors.
    """

    def __init__(
        self, message: str, cls: t.Type[TemplateSyntaxError] = TemplateSyntaxError
    ) -> None:
        self.message = message
        self.error_class = cls

    def __call__(self, lineno: int, filename: str) -> "te.NoReturn":
        raise self.error_class(self.message, lineno, filename)


class Token(t.NamedTuple):
    lineno: int
    type: str
    value: str

    def __str__(self) -> str:
        return describe_token(self)

    def test(self, expr: str) -> bool:
        """Test a token against a token expression. This can either be a
        token type or ``'token_type:token_value'``. This can only test
        against string values and types.
        """
        # here we do a regular string equality check as test_any is usually
        # passed an iterable of non-interned strings.
        if self.type == expr:
            return True

        if ":" in expr:
            return expr.split(":", 1) == [self.type, self.value]

        return False

    def test_any(self, *iterable: str) -> bool:
        """Test against multiple token expressions."""
        return any(self.test(expr) for expr in iterable)
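
# Illustrative sketch (not part of the original module): Token is a NamedTuple,
# so it can be built and tested directly, e.g.
#
#     tok = Token(1, TOKEN_NAME, "endfor")
#     tok.test("name")                           # True (type match)
#     tok.test("name:endfor")                    # True (type:value match)
#     tok.test_any("name:endif", "name:endfor")  # True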
class TokenStreamIterator:
    """The iterator for tokenstreams. Iterate over the stream
    until the eof token is reached.
    """

    def __init__(self, stream: "TokenStream") -> None:
        self.stream = stream

    def __iter__(self) -> "TokenStreamIterator":
        return self

    def __next__(self) -> Token:
        token = self.stream.current

        if token.type is TOKEN_EOF:
            self.stream.close()
            raise StopIteration

        next(self.stream)
        return token


class TokenStream:
    """A token stream is an iterable that yields :class:`Token`\\s. The
    parser however does not iterate over it but calls :meth:`next` to go
    one token ahead. The current active token is stored as :attr:`current`.
    """

    def __init__(
        self,
        generator: t.Iterable[Token],
        name: t.Optional[str],
        filename: t.Optional[str],
    ):
        self._iter = iter(generator)
        self._pushed: "te.Deque[Token]" = deque()
        self.name = name
        self.filename = filename
        self.closed = False
        self.current = Token(1, TOKEN_INITIAL, "")
        next(self)

    def __iter__(self) -> TokenStreamIterator:
        return TokenStreamIterator(self)

    def __bool__(self) -> bool:
        return bool(self._pushed) or self.current.type is not TOKEN_EOF

    @property
    def eos(self) -> bool:
        """Are we at the end of the stream?"""
        return not self

    def push(self, token: Token) -> None:
        """Push a token back to the stream."""
        self._pushed.append(token)

    def look(self) -> Token:
        """Look at the next token."""
        old_token = next(self)
        result = self.current
        self.push(result)
        self.current = old_token
        return result

    def skip(self, n: int = 1) -> None:
        """Skip n tokens ahead."""
        for _ in range(n):
            next(self)

    def next_if(self, expr: str) -> t.Optional[Token]:
        """Perform the token test and return the token if it matched.
        Otherwise the return value is `None`.
        """
        if self.current.test(expr):
            return next(self)

        return None

    def skip_if(self, expr: str) -> bool:
        """Like :meth:`next_if` but only returns `True` or `False`."""
        return self.next_if(expr) is not None

    def __next__(self) -> Token:
        """Go one token ahead and return the old one.

        Use the built-in :func:`next` instead of calling this directly.
        """
        rv = self.current

        if self._pushed:
            self.current = self._pushed.popleft()
        elif self.current.type is not TOKEN_EOF:
            try:
                self.current = next(self._iter)
            except StopIteration:
                self.close()

        return rv

    def close(self) -> None:
        """Close the stream."""
        self.current = Token(self.current.lineno, TOKEN_EOF, "")
        self._iter = iter(())
        self.closed = True

    def expect(self, expr: str) -> Token:
        """Expect a given token type and return it. This accepts the same
        argument as :meth:`jinja2.lexer.Token.test`.
        """
        if not self.current.test(expr):
            expr = describe_token_expr(expr)

            if self.current.type is TOKEN_EOF:
                raise TemplateSyntaxError(
                    f"unexpected end of template, expected {expr!r}.",
                    self.current.lineno,
                    self.name,
                    self.filename,
                )

            raise TemplateSyntaxError(
                f"expected token {expr!r}, got {describe_token(self.current)!r}",
                self.current.lineno,
                self.name,
                self.filename,
            )

        return next(self)
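
# Illustrative sketch (not part of the original module): the parser drives a
# TokenStream through its cursor rather than plain iteration, e.g.
#
#     stream = TokenStream(iter([Token(1, TOKEN_NAME, "if")]), None, None)
#     stream.current             # the "if" name token
#     stream.look()              # peek at the next token (here: eof)
#     stream.skip_if("name:if")  # consume it if it matches, returns True
#     stream.eos                 # True once only eof is left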
def get_lexer(environment: "Environment") -> "Lexer":
    """Return a lexer which is probably cached."""
    key = (
        environment.block_start_string,
        environment.block_end_string,
        environment.variable_start_string,
        environment.variable_end_string,
        environment.comment_start_string,
        environment.comment_end_string,
        environment.line_statement_prefix,
        environment.line_comment_prefix,
        environment.trim_blocks,
        environment.lstrip_blocks,
        environment.newline_sequence,
        environment.keep_trailing_newline,
    )
    lexer = _lexer_cache.get(key)

    if lexer is None:
        _lexer_cache[key] = lexer = Lexer(environment)

    return lexer
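
# Illustrative sketch (not part of the original module): because the cache key
# is built only from the lexing-related settings, environments configured with
# the same delimiters end up sharing a single Lexer instance, e.g.
#
#     from jinja2 import Environment
#     assert Environment().lexer is Environment().lexer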
class OptionalLStrip(tuple):
    """A special tuple for marking a point in the state that can have
    lstrip applied.
    """

    __slots__ = ()

    # Even though it looks like a no-op, creating instances fails
    # without this.
    def __new__(cls, *members, **kwargs):  # type: ignore
        return super().__new__(cls, members)


class _Rule(t.NamedTuple):
    pattern: t.Pattern[str]
    tokens: t.Union[str, t.Tuple[str, ...], t.Tuple[Failure]]
    command: t.Optional[str]


class Lexer:
    """Class that implements a lexer for a given environment. Automatically
    created by the environment class, usually you don't have to do that.

    Note that the lexer is not automatically bound to an environment.
    Multiple environments can share the same lexer.
    """

    def __init__(self, environment: "Environment") -> None:
        # shortcuts
        e = re.escape

        def c(x: str) -> t.Pattern[str]:
            return re.compile(x, re.M | re.S)

        # lexing rules for tags
        tag_rules: t.List[_Rule] = [
            _Rule(whitespace_re, TOKEN_WHITESPACE, None),
            _Rule(float_re, TOKEN_FLOAT, None),
            _Rule(integer_re, TOKEN_INTEGER, None),
            _Rule(name_re, TOKEN_NAME, None),
            _Rule(string_re, TOKEN_STRING, None),
            _Rule(operator_re, TOKEN_OPERATOR, None),
        ]

        # assemble the root lexing rule. because "|" is ungreedy
        # we have to sort by length so that the lexer continues working
        # as expected when we have parsing rules like <% for block and
        # <%= for variables. (if someone wants asp like syntax)
        # variables are just part of the rules if variable processing
        # is required.
        root_tag_rules = compile_rules(environment)

        block_start_re = e(environment.block_start_string)
        block_end_re = e(environment.block_end_string)
        comment_end_re = e(environment.comment_end_string)
        variable_end_re = e(environment.variable_end_string)

        # block suffix if trimming is enabled
        block_suffix_re = "\\n?" if environment.trim_blocks else ""

        self.lstrip_blocks = environment.lstrip_blocks

        self.newline_sequence = environment.newline_sequence
        self.keep_trailing_newline = environment.keep_trailing_newline

        root_raw_re = (
            rf"(?P<raw_begin>{block_start_re}(\-|\+|)\s*raw\s*"
            rf"(?:\-{block_end_re}\s*|{block_end_re}))"
        )
        root_parts_re = "|".join(
            [root_raw_re] + [rf"(?P<{n}>{r}(\-|\+|))" for n, r in root_tag_rules]
        )

        # global lexing rules
        self.rules: t.Dict[str, t.List[_Rule]] = {
            "root": [
                # directives
                _Rule(
                    c(rf"(.*?)(?:{root_parts_re})"),
                    OptionalLStrip(TOKEN_DATA, "#bygroup"),  # type: ignore
                    "#bygroup",
                ),
                # data
                _Rule(c(".+"), TOKEN_DATA, None),
            ],
            # comments
            TOKEN_COMMENT_BEGIN: [
                _Rule(
                    c(
                        rf"(.*?)((?:\+{comment_end_re}|\-{comment_end_re}\s*"
                        rf"|{comment_end_re}{block_suffix_re}))"
                    ),
                    (TOKEN_COMMENT, TOKEN_COMMENT_END),
                    "#pop",
                ),
                _Rule(c(r"(.)"), (Failure("Missing end of comment tag"),), None),
            ],
            # blocks
            TOKEN_BLOCK_BEGIN: [
                _Rule(
                    c(
                        rf"(?:\+{block_end_re}|\-{block_end_re}\s*"
                        rf"|{block_end_re}{block_suffix_re})"
                    ),
                    TOKEN_BLOCK_END,
                    "#pop",
                ),
            ]
            + tag_rules,
            # variables
            TOKEN_VARIABLE_BEGIN: [
                _Rule(
                    c(rf"\-{variable_end_re}\s*|{variable_end_re}"),
                    TOKEN_VARIABLE_END,
                    "#pop",
                )
            ]
            + tag_rules,
            # raw block
            TOKEN_RAW_BEGIN: [
                _Rule(
                    c(
                        rf"(.*?)((?:{block_start_re}(\-|\+|))\s*endraw\s*"
                        rf"(?:\+{block_end_re}|\-{block_end_re}\s*"
                        rf"|{block_end_re}{block_suffix_re}))"
                    ),
                    OptionalLStrip(TOKEN_DATA, TOKEN_RAW_END),  # type: ignore
                    "#pop",
                ),
                _Rule(c(r"(.)"), (Failure("Missing end of raw directive"),), None),
            ],
            # line statements
            TOKEN_LINESTATEMENT_BEGIN: [
                _Rule(c(r"\s*(\n|$)"), TOKEN_LINESTATEMENT_END, "#pop")
            ]
            + tag_rules,
            # line comments
            TOKEN_LINECOMMENT_BEGIN: [
                _Rule(
                    c(r"(.*?)()(?=\n|$)"),
                    (TOKEN_LINECOMMENT, TOKEN_LINECOMMENT_END),
                    "#pop",
                )
            ],
        }
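
        # Illustrative sketch (not part of the original module): tokeniter()
        # below drives these rules as a small pushdown state machine. For a
        # template like "{% if x %}" the "root" rule matches "{%", pushes the
        # "block_begin" state via "#bygroup", the tag rules then emit the
        # "if" and "x" name tokens, and the block-end rule pops back to
        # "root" with "#pop".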
    def _normalize_newlines(self, value: str) -> str:
        """Replace all newlines with the configured sequence in strings
        and template data.
        """
        return newline_re.sub(self.newline_sequence, value)

    def tokenize(
        self,
        source: str,
        name: t.Optional[str] = None,
        filename: t.Optional[str] = None,
        state: t.Optional[str] = None,
    ) -> TokenStream:
        """Calls tokeniter and wrap and returns the result as a token stream."""
        stream = self.tokeniter(source, name, filename, state)
        return TokenStream(self.wrap(stream, name, filename), name, filename)
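
    # Illustrative sketch (not part of the original module): with a default
    # environment, tokenize() produces the parser-facing stream, e.g.
    #
    #     from jinja2 import Environment
    #     stream = Environment().lexer.tokenize("Hello {{ name }}!")
    #     [tok.type for tok in stream]
    #     # ['data', 'variable_begin', 'name', 'variable_end', 'data']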
    def wrap(
        self,
        stream: t.Iterable[t.Tuple[int, str, str]],
        name: t.Optional[str] = None,
        filename: t.Optional[str] = None,
    ) -> t.Iterator[Token]:
        """This is called with the stream as returned by `tokeniter` and wraps
        every token in a :class:`Token` and converts the value.
        """
        for lineno, token, value_str in stream:
            if token in ignored_tokens:
                continue

            value: t.Any = value_str

            if token == TOKEN_LINESTATEMENT_BEGIN:
                token = TOKEN_BLOCK_BEGIN
            elif token == TOKEN_LINESTATEMENT_END:
                token = TOKEN_BLOCK_END
            # we are not interested in those tokens in the parser
            elif token in (TOKEN_RAW_BEGIN, TOKEN_RAW_END):
                continue
            elif token == TOKEN_DATA:
                value = self._normalize_newlines(value_str)
            elif token == "keyword":
                token = value_str
            elif token == TOKEN_NAME:
                value = value_str

                if not value.isidentifier():
                    raise TemplateSyntaxError(
                        "Invalid character in identifier", lineno, name, filename
                    )
            elif token == TOKEN_STRING:
                # try to unescape string
                try:
                    value = (
                        self._normalize_newlines(value_str[1:-1])
                        .encode("ascii", "backslashreplace")
                        .decode("unicode-escape")
                    )
                except Exception as e:
                    msg = str(e).split(":")[-1].strip()
                    raise TemplateSyntaxError(msg, lineno, name, filename) from e
            elif token == TOKEN_INTEGER:
                value = int(value_str.replace("_", ""), 0)
            elif token == TOKEN_FLOAT:
                # remove all "_" first to support more Python versions
                value = literal_eval(value_str.replace("_", ""))
            elif token == TOKEN_OPERATOR:
                token = operators[value_str]

            yield Token(lineno, token, value)
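
    # Illustrative sketch (not part of the original module): wrap() is where
    # raw lexemes become Python values, e.g. an ("integer", "1_000") pair
    # becomes Token(lineno, "integer", 1000), ("float", "2.5e3") becomes the
    # Python float 2500.0, and ("operator", "==") is re-tagged as an "eq" token.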
    def tokeniter(
        self,
        source: str,
        name: t.Optional[str],
        filename: t.Optional[str] = None,
        state: t.Optional[str] = None,
    ) -> t.Iterator[t.Tuple[int, str, str]]:
        """This method tokenizes the text and returns the tokens in a
        generator. Use this method if you just want to tokenize a template.

        .. versionchanged:: 3.0
            Only ``\\n``, ``\\r\\n`` and ``\\r`` are treated as line
            breaks.
        """
        lines = newline_re.split(source)[::2]

        if not self.keep_trailing_newline and lines[-1] == "":
            del lines[-1]

        source = "\n".join(lines)
        pos = 0
        lineno = 1
        stack = ["root"]

        if state is not None and state != "root":
            assert state in ("variable", "block"), "invalid state"
            stack.append(state + "_begin")

        statetokens = self.rules[stack[-1]]
        source_length = len(source)
        balancing_stack: t.List[str] = []
        newlines_stripped = 0
        line_starting = True

        while True:
            # tokenizer loop
            for regex, tokens, new_state in statetokens:
                m = regex.match(source, pos)

                # if no match we try again with the next rule
                if m is None:
                    continue

                # we only match blocks and variables if braces / parentheses
                # are balanced. continue parsing with the lower rule which
                # is the operator rule. do this only if the end tags look
                # like operators
                if balancing_stack and tokens in (
                    TOKEN_VARIABLE_END,
                    TOKEN_BLOCK_END,
                    TOKEN_LINESTATEMENT_END,
                ):
                    continue

                # tuples support more options
                if isinstance(tokens, tuple):
                    groups: t.Sequence[str] = m.groups()

                    if isinstance(tokens, OptionalLStrip):
                        # Rule supports lstrip. Match will look like
                        # text, block type, whitespace control, type, control, ...
                        text = groups[0]
                        # Skipping the text and first type, every other group is the
                        # whitespace control for each type. One of the groups will be
                        # -, +, or empty string instead of None.
                        strip_sign = next(g for g in groups[2::2] if g is not None)

                        if strip_sign == "-":
                            # Strip all whitespace between the text and the tag.
                            stripped = text.rstrip()
                            newlines_stripped = text[len(stripped) :].count("\n")
                            groups = [stripped, *groups[1:]]
                        elif (
                            # Not marked for preserving whitespace.
                            strip_sign != "+"
                            # lstrip is enabled.
                            and self.lstrip_blocks
                            # Not a variable expression.
                            and not m.groupdict().get(TOKEN_VARIABLE_BEGIN)
                        ):
                            # The start of text between the last newline and the tag.
                            l_pos = text.rfind("\n") + 1

                            if l_pos > 0 or line_starting:
                                # If there's only whitespace between the newline and the
                                # tag, strip it.
                                if whitespace_re.fullmatch(text, l_pos):
                                    groups = [text[:l_pos], *groups[1:]]

                    for idx, token in enumerate(tokens):
                        # failure group
                        if token.__class__ is Failure:
                            raise token(lineno, filename)
                        # bygroup is a bit more complex, in that case we
                        # yield for the current token the first named
                        # group that matched
                        elif token == "#bygroup":
                            for key, value in m.groupdict().items():
                                if value is not None:
                                    yield lineno, key, value
                                    lineno += value.count("\n")
                                    break
                            else:
                                raise RuntimeError(
                                    f"{regex!r} wanted to resolve the token dynamically"
                                    " but no group matched"
                                )
                        # normal group
                        else:
                            data = groups[idx]

                            if data or token not in ignore_if_empty:
                                yield lineno, token, data

                            lineno += data.count("\n") + newlines_stripped
                            newlines_stripped = 0
                # strings as tokens are yielded as-is
                else:
                    data = m.group()

                    # update brace/parentheses balance
                    if tokens == TOKEN_OPERATOR:
                        if data == "{":
                            balancing_stack.append("}")
                        elif data == "(":
                            balancing_stack.append(")")
                        elif data == "[":
                            balancing_stack.append("]")
                        elif data in ("}", ")", "]"):
                            if not balancing_stack:
                                raise TemplateSyntaxError(
                                    f"unexpected '{data}'", lineno, name, filename
                                )

                            expected_op = balancing_stack.pop()

                            if expected_op != data:
                                raise TemplateSyntaxError(
                                    f"unexpected '{data}', expected '{expected_op}'",
                                    lineno,
                                    name,
                                    filename,
                                )

                    # yield items
                    if data or tokens not in ignore_if_empty:
                        yield lineno, tokens, data

                    lineno += data.count("\n")

                line_starting = m.group()[-1:] == "\n"
                # fetch new position into new variable so that we can check
                # if there is an internal parsing error which would result
                # in an infinite loop
                pos2 = m.end()

                # handle state changes
                if new_state is not None:
                    # remove the uppermost state
                    if new_state == "#pop":
                        stack.pop()
                    # resolve the new state by group checking
                    elif new_state == "#bygroup":
                        for key, value in m.groupdict().items():
                            if value is not None:
                                stack.append(key)
                                break
                        else:
                            raise RuntimeError(
                                f"{regex!r} wanted to resolve the new state dynamically"
                                f" but no group matched"
                            )
                    # direct state name given
                    else:
                        stack.append(new_state)

                    statetokens = self.rules[stack[-1]]
                # we are still at the same position and no stack change.
                # this means a loop without break condition, avoid that and
                # raise error
                elif pos2 == pos:
                    raise RuntimeError(
                        f"{regex!r} yielded empty string without stack change"
                    )

                # publish the new position and start again
                pos = pos2
                break

            # if loop terminated without break we haven't found a single match
            # either we are at the end of the file or we have a problem
            else:
                # end of text
                if pos >= source_length:
                    return

                # something went wrong
                raise TemplateSyntaxError(
                    f"unexpected char {source[pos]!r} at {pos}", lineno, name, filename
                )
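
# Illustrative sketch (not part of the original module): tokeniter() yields the
# raw (lineno, token, value) tuples before wrap() filters and converts them,
# e.g. with a default environment
#
#     from jinja2 import Environment
#     list(Environment().lexer.tokeniter("{{ a }}", None))
#     # [(1, 'variable_begin', '{{'), (1, 'whitespace', ' '),
#     #  (1, 'name', 'a'), (1, 'whitespace', ' '), (1, 'variable_end', '}}')]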